####################################################
#Author: Kelli Marquardt
#Purpose: Produce NHIS vs Empirical Sample Comparison Table (b3)

# Inputs:
#-  data/est_dat_fake.csv 
#-  data/nhis_00004.xml
#-  data/nhis_00004.dat


# Outputs:
#-  output/tables/tab_b3.txt

####################################################



############################
#0 load required packages
############################
rm(list = ls(all.names = TRUE))

#load packages 
library(ipumsr)
library(data.table)
library(dplyr)



#################################################
#Step 1: read in the sample data and get the relevant stats 
#################################################

est_dat = read.csv(file.path("..", "data", "est_dat_fake.csv"), stringsAsFactors = FALSE)

dat_Q=c(mean(est_dat$Qi), 
        mean(est_dat$Qi[which(est_dat$male==1)])/
        mean(est_dat$Qi[which(est_dat$male==0)]))
dat_D=c(mean(est_dat$Di), 
        mean(est_dat$Di[which(est_dat$male==1)])/
        mean(est_dat$Di[which(est_dat$male==0)]))
dat_Dcondit=c(mean(est_dat$Di[which(est_dat$Qi==1)]),
              mean(est_dat$Di[which(est_dat$male==1 & est_dat$Qi==1)])/
              mean(est_dat$Di[which(est_dat$male==0 & est_dat$Qi==1)]))

#no longer need est_dat
rm(est_dat)

############################
#Step 2a: Read in ipums data extract and clean
############################

# read data extract and keep those age 5-17
data = ipumsr::read_ipums_micro(file.path("..", "data", "nhis_00004.xml")) %>%
  filter(AGE <= 17 & AGE >= 5)
setDT(data)


# Keep all in universe of SAWGEN (note, this is the same as the universe for DVINT and other relevant health questions)
data = data[SAWGEN != 0,]

# replace DREMOPROB to no if NIU
data[DREMOPROB == 0, DREMOPROB := 1]


# drop observations in which the following variables are missing (i.e., if they are unknown-refused, unknown-ascertained, unknown-don't know etc) :  DVINT, SAWGEN, VISITYRNO, CHECKUP 
data = data[!DVINT %in% c(997:999) & 
              !SAWGEN %in% 7:9 & 
              !VISITYRNO %in% 97:99 & 
              !CHECKUP %in% 7:9] 

# drop observations in which the following variables are unknown : SAWMENT & DREMOPROB 
data = data[!SAWMENT %in% 7:9 & 
              !DREMOPROB %in% 7:9] 


############################
#Step 2b: Define relevant statistics for table
############################

# Visit_in_4 = 1 if DVINT  %in% 200:399, NA if DVINT >= 997 & 0 otherwise 
data[, visit_in_4 := ifelse(DVINT %in% 200:399, 1, NA)]
data[, visit_in_4 := ifelse(is.na(visit_in_4) & DVINT < 997, 0, visit_in_4)]

#  AnyVisit=1 if (DVINT in 200-301 | SAWGEN=2 |  VISITYRNO in 10-44 | CHECKUP =2 | SAWMENT=2 | DREMOPROB=2) and 0 otherwise. 
data[, any_visit := ifelse(
  DVINT %in% 200:301 | 
    SAWGEN == 2 | 
    VISITYRNO %in% 10:44 | 
    CHECKUP == 2 | 
    SAWMENT == 2 | 
    DREMOPROB == 2, 1, 0
)]


# AnyBehav=1 if (SAWMENT=2 | DREMOPROB=2) and 0 otherwise. 
data[, any_behav := ifelse(
  SAWMENT == 2 | DREMOPROB == 2, 1, 0
)]

# ADHD_Dx=1 if ADDEV=2, =0 if ADDEV=1, and NA otherwise. 
data[, adhd_dx := ifelse(ADDEV == 2, 1, NA)]
data[, adhd_dx := ifelse(ADDEV == 1, 0, adhd_dx)] 



############################
#Step 3: Produce stats for unconditional means and ratios, and conditional means and ratios
############################

#unconditional means and ratios

visit_in_4_wtd_mean = round(weighted.mean(data$visit_in_4, w = data$SAMPWEIGHT, na.rm = T),3)
visit_in_4_ratio = round(weighted.mean(data$visit_in_4[data$SEX == 1], w = data$SAMPWEIGHT[data$SEX == 1], na.rm = T) / 
                           weighted.mean(data$visit_in_4[data$SEX == 2], w = data$SAMPWEIGHT[data$SEX == 2], na.rm = T),3)

annual_visit_wtd_mean = round(weighted.mean(data$any_visit, w = data$SAMPWEIGHT, na.rm = T), 3)
annual_visit_ratio = round(weighted.mean(data$any_visit[data$SEX == 1], w = data$SAMPWEIGHT[data$SEX == 1], na.rm = T) /
                             weighted.mean(data$any_visit[data$SEX == 2], w = data$SAMPWEIGHT[data$SEX == 2], na.rm = T), 3)

adhd_wtd_mean = round(weighted.mean(data$adhd_dx, w = data$SAMPWEIGHT, na.rm = T),3)
adhd_ratio = round(weighted.mean(data$adhd_dx[data$SEX == 1], w = data$SAMPWEIGHT[data$SEX == 1], na.rm = T) / 
                     weighted.mean(data$adhd_dx[data$SEX == 2], w = data$SAMPWEIGHT[data$SEX == 2], na.rm = T),2)




#conditional means and ratios

data_any = data[any_visit == 1, ]


behav_wtd_mean = round(weighted.mean(data_any$any_behav, w = data_any$SAMPWEIGHT, na.rm = T),3)
behav_ratio = round(weighted.mean(data_any$any_behav[data_any$SEX == 1], w = data_any$SAMPWEIGHT[data_any$SEX == 1], na.rm = T) / 
                      weighted.mean(data_any$any_behav[data_any$SEX == 2], w = data_any$SAMPWEIGHT[data_any$SEX == 2], na.rm = T),2)



data_behav = data[any_behav == 1, ]

adhd_cond_behav_wtd_mean = round(weighted.mean(data_behav$adhd_dx, w = data_behav$SAMPWEIGHT, na.rm = T),3)
adhd_cond_behav_ratio = round(weighted.mean(data_behav$adhd_dx[data_behav$SEX == 1], w = data_behav$SAMPWEIGHT[data_behav$SEX == 1], na.rm = T) / 
                                weighted.mean(data_behav$adhd_dx[data_behav$SEX == 2], w = data_behav$SAMPWEIGHT[data_behav$SEX == 2], na.rm = T),2)



############################
#Step 4: Build and Save the comparison table
############################


#table for nhis sample  
nhis_tab=data.frame(stat_name=c('Visit in 4 Years', 
                          'Annual Visit',
                          'Behavioral Visit $\\mid$ Annual',
                          'ADHD (ever)',
                          'ADHD (ever) $\\mid$ Behavioral Visit'),
                    overall=c(visit_in_4_wtd_mean, annual_visit_wtd_mean, behav_wtd_mean, adhd_wtd_mean, adhd_cond_behav_wtd_mean),
                    ratio=c(visit_in_4_ratio, annual_visit_ratio, behav_ratio, adhd_ratio, adhd_cond_behav_ratio))

#table for empirical sample
ehr_tab=data.frame(stat_name=c('$Q_i=1$', 
                                '$D_i=1$',
                                '$D_i=1|Q_i=1$'),
                    overall=c(dat_Q[1], dat_D[1], dat_Dcondit[1]),
                    ratio=c(dat_Q[2], dat_D[2], dat_Dcondit[2]))




###############
header=paste("\\begin{tabular}{lcc|lcc}\n",
             "\\toprule \n",
             "& Overall & Male:Female  & & Overall & Male:Female \\\\\n",
             "\\textbf{NHIS Sample} & & Ratio & \\textbf{EHR Sample} & & Ratio \\\\\n",
             "\\midrule \n")
footer=paste("\\bottomrule\n",
             "\\end{tabular}\n")

#build body 
body=""
for(i in 1:2){
  body=paste(body,
             sprintf("%s & %.3f & %.3f & - & - & - \\\\\n",
                     nhis_tab[i,1], nhis_tab[i,2], nhis_tab[i,3]))
}
for (i in 3:3){
  body=paste(body,  
             sprintf("%s & %.3f & %.2f & %s & %.3f & %.2f \\\\\n",
                            nhis_tab[i,1], nhis_tab[i,2], nhis_tab[i,3],
                            ehr_tab[(i-2),1], ehr_tab[(i-2),2], ehr_tab[(i-2),3]),
             "\\midrule \n ")
}
for(i in 4:5){
  body=paste(body, 
             sprintf("%s & %.3f & %.2f & %s & %.3f & %.2f \\\\\n",
                     nhis_tab[i,1], nhis_tab[i,2], nhis_tab[i,3],
                     ehr_tab[(i-2),1], ehr_tab[(i-2),2], ehr_tab[(i-2),3]))
}

hold=paste(header, body, footer, sep="")
write(hold, file = file.path("..", "output", "tables", "tab_b3.txt"))

#END OF SCRIPT
